Unsupervised Learning Assignment

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#For splitting data
from sklearn.model_selection import train_test_split
#For scaling the data to remove the importance of units
from sklearn.preprocessing import StandardScaler
# To calculate the accuracy score of the model
from sklearn.metrics import accuracy_score, confusion_matrix
In [2]:
#importing file into a dataset
ds_vehicle= pd.read_csv("vehicle-1.csv")
In [3]:
#general information about data
ds_vehicle.head(10)
Out[3]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
0 95 48.0 83.0 178.0 72.0 10 162.0 42.0 20.0 159 176.0 379.0 184.0 70.0 6.0 16.0 187.0 197 van
1 91 41.0 84.0 141.0 57.0 9 149.0 45.0 19.0 143 170.0 330.0 158.0 72.0 9.0 14.0 189.0 199 van
2 104 50.0 106.0 209.0 66.0 10 207.0 32.0 23.0 158 223.0 635.0 220.0 73.0 14.0 9.0 188.0 196 car
3 93 41.0 82.0 159.0 63.0 9 144.0 46.0 19.0 143 160.0 309.0 127.0 63.0 6.0 10.0 199.0 207 van
4 85 44.0 70.0 205.0 103.0 52 149.0 45.0 19.0 144 241.0 325.0 188.0 127.0 9.0 11.0 180.0 183 bus
5 107 NaN 106.0 172.0 50.0 6 255.0 26.0 28.0 169 280.0 957.0 264.0 85.0 5.0 9.0 181.0 183 bus
6 97 43.0 73.0 173.0 65.0 6 153.0 42.0 19.0 143 176.0 361.0 172.0 66.0 13.0 1.0 200.0 204 bus
7 90 43.0 66.0 157.0 65.0 9 137.0 48.0 18.0 146 162.0 281.0 164.0 67.0 3.0 3.0 193.0 202 van
8 86 34.0 62.0 140.0 61.0 7 122.0 54.0 17.0 127 141.0 223.0 112.0 64.0 2.0 14.0 200.0 208 van
9 93 44.0 98.0 NaN 62.0 11 183.0 36.0 22.0 146 202.0 505.0 152.0 64.0 4.0 14.0 195.0 204 car
In [4]:
ds_vehicle.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 846 entries, 0 to 845
Data columns (total 19 columns):
compactness                    846 non-null int64
circularity                    841 non-null float64
distance_circularity           842 non-null float64
radius_ratio                   840 non-null float64
pr.axis_aspect_ratio           844 non-null float64
max.length_aspect_ratio        846 non-null int64
scatter_ratio                  845 non-null float64
elongatedness                  845 non-null float64
pr.axis_rectangularity         843 non-null float64
max.length_rectangularity      846 non-null int64
scaled_variance                843 non-null float64
scaled_variance.1              844 non-null float64
scaled_radius_of_gyration      844 non-null float64
scaled_radius_of_gyration.1    842 non-null float64
skewness_about                 840 non-null float64
skewness_about.1               845 non-null float64
skewness_about.2               845 non-null float64
hollows_ratio                  846 non-null int64
class                          846 non-null object
dtypes: float64(14), int64(4), object(1)
memory usage: 125.7+ KB
In [5]:
ds_vehicle.shape
Out[5]:
(846, 19)
In [6]:
#Checking for null values
print("Any null values in the dataset :",ds_vehicle.isnull().values.any())
ds_vehicle.isnull().sum()
Any null values in the dataset : True
Out[6]:
compactness                    0
circularity                    5
distance_circularity           4
radius_ratio                   6
pr.axis_aspect_ratio           2
max.length_aspect_ratio        0
scatter_ratio                  1
elongatedness                  1
pr.axis_rectangularity         3
max.length_rectangularity      0
scaled_variance                3
scaled_variance.1              2
scaled_radius_of_gyration      2
scaled_radius_of_gyration.1    4
skewness_about                 6
skewness_about.1               1
skewness_about.2               1
hollows_ratio                  0
class                          0
dtype: int64
In [7]:
#List all the rows having missing value in any of the single or multiple columns
#Columns having missing values
missing_values_cols=ds_vehicle.columns[ds_vehicle.isnull().any()]
ds_vehicle[ds_vehicle.isnull().any(axis=1)][missing_values_cols].head()
Out[7]:
circularity distance_circularity radius_ratio pr.axis_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2
5 NaN 106.0 172.0 50.0 255.0 26.0 28.0 280.0 957.0 264.0 85.0 5.0 9.0 181.0
9 44.0 98.0 NaN 62.0 183.0 36.0 22.0 202.0 505.0 152.0 64.0 4.0 14.0 195.0
19 56.0 100.0 215.0 NaN 208.0 32.0 24.0 227.0 651.0 223.0 74.0 6.0 5.0 186.0
35 46.0 NaN 172.0 67.0 157.0 43.0 20.0 170.0 363.0 184.0 67.0 17.0 7.0 192.0
66 43.0 68.0 125.0 57.0 149.0 46.0 19.0 169.0 323.0 172.0 NaN NaN 18.0 179.0
In [8]:
#Label encode the target class
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
ds_vehicle["class"] = labelencoder.fit_transform(ds_vehicle['class'])
ds_vehicle["class"].value_counts()
Out[8]:
1    429
0    218
2    199
Name: class, dtype: int64

Before proceeding , we are going to fill in the missing values and then check the correlation between values and drop them

In [9]:
#Missing treatment for circularity
ds_vehicle[ds_vehicle['circularity'].isnull()][missing_values_cols]
Out[9]:
circularity distance_circularity radius_ratio pr.axis_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2
5 NaN 106.0 172.0 50.0 255.0 26.0 28.0 280.0 957.0 264.0 85.0 5.0 9.0 181.0
105 NaN 103.0 202.0 64.0 220.0 30.0 25.0 NaN 711.0 214.0 73.0 11.0 NaN 188.0
118 NaN NaN 128.0 56.0 150.0 46.0 19.0 168.0 324.0 173.0 82.0 9.0 14.0 180.0
266 NaN 65.0 116.0 53.0 152.0 45.0 19.0 175.0 335.0 NaN 85.0 5.0 4.0 179.0
396 NaN 106.0 177.0 51.0 256.0 26.0 28.0 285.0 966.0 261.0 87.0 11.0 2.0 182.0
In [10]:
#Dropping rows 105,118 and 266 since they have multiple NaN values 
ds_vehicle.drop([105,118,266], inplace=True)
In [11]:
ds_vehicle.loc[5].loc['class'],ds_vehicle.loc[396].loc['class']
Out[11]:
(0.0, 0.0)
In [12]:
#Since from the data we observe both of the remaining rows are of the catefory "bus"
#we will be replacing it with the mean of the same 
ds_vehicle['circularity'].fillna(ds_vehicle['circularity'][ds_vehicle['class']==0].median(), inplace=True)
In [13]:
ds_vehicle[ds_vehicle['circularity'].isnull()][missing_values_cols]
Out[13]:
circularity distance_circularity radius_ratio pr.axis_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2
In [14]:
#Moving to distance_circularity, we repeat the same process for the rest of the missing values in differnt attributes
ds_vehicle[ds_vehicle['distance_circularity'].isnull()][missing_values_cols]
Out[14]:
circularity distance_circularity radius_ratio pr.axis_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2
35 46.0 NaN 172.0 67.0 157.0 43.0 20.0 170.0 363.0 184.0 67.0 17.0 7.0 192.0
207 42.0 NaN 121.0 55.0 149.0 46.0 19.0 167.0 323.0 NaN 85.0 1.0 6.0 179.0
319 51.0 NaN 194.0 60.0 220.0 30.0 25.0 247.0 731.0 209.0 80.0 7.0 7.0 188.0
In [15]:
#Since there is more than 1 missing values in 207 row, we drop it and replace rest of the missing values in distance_circularity by median
ds_vehicle.drop(207, inplace=True)
In [16]:
#As observed, row 35 and 319 are of different categories so we are going to replace the missing values by their individual medians
ds_vehicle.loc[35]=ds_vehicle.loc[35].replace(np.nan,ds_vehicle['distance_circularity'][ds_vehicle['class']==2].median())
ds_vehicle.loc[319]=ds_vehicle.loc[319].replace(np.nan,ds_vehicle['distance_circularity'][ds_vehicle['class']==0].median())
In [17]:
#Treating missing values for radius_ratio
ds_vehicle[ds_vehicle['radius_ratio'].isnull()][missing_values_cols]
#Since there is no other missing values in other attribute, we will not drop any. Replacing the missing values with the respective categories of vehicles.
Out[17]:
circularity distance_circularity radius_ratio pr.axis_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2
9 44.0 98.0 NaN 62.0 183.0 36.0 22.0 202.0 505.0 152.0 64.0 4.0 14.0 195.0
78 52.0 94.0 NaN 66.0 208.0 31.0 24.0 227.0 666.0 218.0 76.0 11.0 4.0 193.0
159 45.0 75.0 NaN 57.0 150.0 44.0 19.0 170.0 335.0 180.0 66.0 16.0 2.0 193.0
287 43.0 84.0 NaN 55.0 154.0 44.0 19.0 174.0 350.0 164.0 73.0 6.0 2.0 185.0
345 54.0 106.0 NaN 57.0 236.0 28.0 26.0 256.0 833.0 253.0 81.0 6.0 14.0 185.0
467 54.0 104.0 NaN 58.0 215.0 31.0 24.0 221.0 682.0 222.0 75.0 13.0 23.0 186.0
In [18]:
ds_vehicle.loc[[9,78,159,287,345,467]]['class']
#From the data we observe that :
#Row 9,159, 467 are of category 1
#Row 78, 345 are of category 0
#Row 387 is of category  2
Out[18]:
9      1.0
78     0.0
159    1.0
287    2.0
345    0.0
467    1.0
Name: class, dtype: float64
In [19]:
#Replacing missing values according to the category 
ds_vehicle.loc[[9,159,467]]=ds_vehicle.loc[[9,159,467]].replace(np.nan,ds_vehicle['radius_ratio'][ds_vehicle['class']==1].median())
ds_vehicle.loc[[78,345 ]]=ds_vehicle.loc[[ 78,345 ]].replace(np.nan,ds_vehicle['radius_ratio'][ds_vehicle['class']==0].median())
ds_vehicle.loc[287]=ds_vehicle.loc[287].replace(np.nan,ds_vehicle['radius_ratio'][ds_vehicle['class']==2].median())
In [20]:
#Treating missing values for pr.axis_aspect_ratio
ds_vehicle[ds_vehicle['pr.axis_aspect_ratio'].isnull()][missing_values_cols]
Out[20]:
circularity distance_circularity radius_ratio pr.axis_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2
19 56.0 100.0 215.0 NaN 208.0 32.0 24.0 227.0 651.0 223.0 74.0 6.0 5.0 186.0
222 50.0 81.0 197.0 NaN 186.0 34.0 22.0 206.0 531.0 198.0 74.0 NaN 1.0 197.0
In [21]:
#Dropping row 222 since it has more than 1 NaN value, and replacing the remaining with median of the same class
# drop row 222
ds_vehicle.drop(222, inplace=True)
In [22]:
#From data we observe that row 19 is of class 1 ; so finding out median of "pr.axis_aspect_ratio" for class 1 and replacing it
ds_vehicle.loc[19]=ds_vehicle.loc[19].replace(np.nan,ds_vehicle['pr.axis_aspect_ratio'][ds_vehicle['class']==1].median())
In [23]:
#Treating scatter_ratio
ds_vehicle[ds_vehicle['scatter_ratio'].isnull()][missing_values_cols]
Out[23]:
circularity distance_circularity radius_ratio pr.axis_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2
249 34.0 53.0 127.0 58.0 NaN 58.0 17.0 137.0 197.0 127.0 70.0 NaN 20.0 185.0
In [24]:
#Since there are multiple attributes with NaN values, we are going to drop this row.
ds_vehicle.drop(249,inplace=True)
In [25]:
#Treating missing value for elongatedness
ds_vehicle[ds_vehicle['elongatedness'].isnull()][missing_values_cols]
Out[25]:
circularity distance_circularity radius_ratio pr.axis_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2
215 39.0 86.0 169.0 62.0 162.0 NaN 20.0 194.0 388.0 147.0 74.0 1.0 22.0 185.0
In [26]:
ds_vehicle.loc[215]['class']
#From data, we observe the class for this element is 1, so finding out median of class 1 and replacing it as missing value for elongatedness
ds_vehicle.loc[215]=ds_vehicle.loc[215].replace(np.nan,ds_vehicle['elongatedness'][ds_vehicle['class']==1].median())
In [27]:
#Treating missing values for pr.axis_rectangularity
ds_vehicle[ds_vehicle['pr.axis_rectangularity'].isnull()][missing_values_cols]
#Since there is no multiple missing attributes , we are going to replace the missing values with class according median
Out[27]:
circularity distance_circularity radius_ratio pr.axis_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2
70 55.0 98.0 161.0 54.0 215.0 31.0 NaN 226.0 683.0 221.0 76.0 3.0 6.0 185.0
237 45.0 65.0 128.0 56.0 151.0 45.0 NaN 170.0 332.0 186.0 81.0 1.0 10.0 179.0
273 45.0 80.0 162.0 63.0 146.0 46.0 NaN 161.0 316.0 161.0 64.0 5.0 10.0 199.0
In [28]:
#Class of the missing values
ds_vehicle.loc[[70,237,273]]['class']
Out[28]:
70     1.0
237    0.0
273    2.0
Name: class, dtype: float64
In [29]:
ds_vehicle.loc[70]=ds_vehicle.loc[70].replace(np.nan,ds_vehicle['pr.axis_rectangularity'][ds_vehicle['class']==1].median())
ds_vehicle.loc[237]=ds_vehicle.loc[237].replace(np.nan,ds_vehicle['pr.axis_rectangularity'][ds_vehicle['class']==0].median())
ds_vehicle.loc[273]=ds_vehicle.loc[273].replace(np.nan,ds_vehicle['pr.axis_rectangularity'][ds_vehicle['class']==2].median())
In [30]:
#Treating scaled variance
ds_vehicle[ds_vehicle['scaled_variance'].isnull()][missing_values_cols]
#Since none of the missing rows have any other missing values, proceeding with replacing with mean based on class
Out[30]:
circularity distance_circularity radius_ratio pr.axis_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2
372 47.0 87.0 164.0 64.0 156.0 43.0 20.0 NaN 359.0 182.0 68.0 1.0 13.0 192.0
522 36.0 69.0 162.0 63.0 140.0 48.0 18.0 NaN 291.0 126.0 66.0 1.0 38.0 193.0
In [31]:
#Since none of the missing rows have any other missing values, proceeding with replacing with mean based on class
ds_vehicle.loc[[372,522]]['class']
Out[31]:
372    2.0
522    1.0
Name: class, dtype: float64
In [32]:
ds_vehicle.loc[372]=ds_vehicle.loc[372].replace(np.nan,ds_vehicle['scaled_variance'][ds_vehicle['class']==2].median())
ds_vehicle.loc[522]=ds_vehicle.loc[522].replace(np.nan,ds_vehicle['scaled_variance'][ds_vehicle['class']==1].median())
In [33]:
#Treating scaled_variance.1
ds_vehicle[ds_vehicle['scaled_variance.1'].isnull()][missing_values_cols]
Out[33]:
circularity distance_circularity radius_ratio pr.axis_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2
308 51.0 100.0 197.0 59.0 192.0 34.0 22.0 210.0 NaN 195.0 64.0 14.0 3.0 196.0
496 55.0 98.0 224.0 68.0 215.0 31.0 24.0 222.0 NaN 214.0 68.0 2.0 29.0 189.0
In [34]:
#From the data, we observe both the rows have the same class, so replacing with the median 
ds_vehicle.loc[[308,496]]=ds_vehicle.loc[[ 308,496]].replace(np.nan,ds_vehicle['scaled_variance.1'][ds_vehicle['class']==1].median())
In [35]:
#Treating dataframe for missing values in scaled_radius_of_gyration.1
ds_vehicle[ds_vehicle['scaled_radius_of_gyration.1'].isnull()][missing_values_cols]
Out[35]:
circularity distance_circularity radius_ratio pr.axis_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2
66 43.0 68.0 125.0 57.0 149.0 46.0 19.0 169.0 323.0 172.0 NaN NaN 18.0 179.0
77 40.0 62.0 140.0 62.0 150.0 45.0 19.0 165.0 330.0 173.0 NaN 2.0 3.0 180.0
192 43.0 76.0 149.0 57.0 149.0 44.0 19.0 172.0 335.0 176.0 NaN 14.0 0.0 189.0
329 38.0 80.0 169.0 59.0 161.0 41.0 20.0 186.0 389.0 137.0 NaN 5.0 15.0 192.0
In [36]:
#Since row 66 has multiple missing attributes, we will drop it.
#For the rest of the rows,we proceed with median replacement based on class
ds_vehicle.drop(66, inplace=True)
ds_vehicle.loc[[77,192,329]]['class']
Out[36]:
77     1.0
192    1.0
329    1.0
Name: class, dtype: float64
In [37]:
#Since all the rows belong to class 1 of vehicles, we find the median and replace
ds_vehicle.loc[[77,192,329]]=ds_vehicle.loc[[ 77,192,329]].replace(np.nan,ds_vehicle['scaled_radius_of_gyration.1'][ds_vehicle['class']==1].median())
In [38]:
#Treating data for skewness_about attribute
ds_vehicle[ds_vehicle['skewness_about'].isnull()][missing_values_cols]
Out[38]:
circularity distance_circularity radius_ratio pr.axis_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2
141 42.0 63.0 125.0 55.0 149.0 46.0 19.0 166.0 320.0 172.0 86.0 NaN 7.0 179.0
177 44.0 72.0 160.0 66.0 144.0 46.0 19.0 166.0 312.0 169.0 69.0 NaN 1.0 191.0
285 48.0 85.0 189.0 64.0 169.0 39.0 20.0 188.0 427.0 190.0 64.0 NaN 5.0 195.0
In [39]:
ds_vehicle.loc[[141,177,285]]['class']
Out[39]:
141    0.0
177    0.0
285    1.0
Name: class, dtype: float64
In [40]:
ds_vehicle.loc[[141,177]]=ds_vehicle.loc[[141,177]].replace(np.nan,ds_vehicle['skewness_about'][ds_vehicle['class']==0].median())
ds_vehicle.loc[[285]]=ds_vehicle.loc[[285]].replace(np.nan,ds_vehicle['skewness_about'][ds_vehicle['class']==1].median())
In [41]:
#Treating dataframe for missing value in skewness_about.1
ds_vehicle[ds_vehicle['skewness_about.1'].isnull()][missing_values_cols]
#No missing values as we have already dropped the corresponding row in our earlier treatments.
Out[41]:
circularity distance_circularity radius_ratio pr.axis_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2
In [42]:
#Treating data for skewness_about.2
ds_vehicle[ds_vehicle['skewness_about.2'].isnull()][missing_values_cols]
Out[42]:
circularity distance_circularity radius_ratio pr.axis_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2
419 34.0 72.0 144.0 56.0 133.0 50.0 18.0 158.0 263.0 125.0 63.0 5.0 20.0 NaN
In [43]:
#From the data, we observe it is of the class 1
ds_vehicle.loc[[419]]=ds_vehicle.loc[[419]].replace(np.nan,ds_vehicle['skewness_about.2'][ds_vehicle['class']==1].median())
#Replacing the data with the median
In [44]:
#Checking for the missing values after treatment
ds_vehicle.isnull().sum()
Out[44]:
compactness                    0
circularity                    0
distance_circularity           0
radius_ratio                   0
pr.axis_aspect_ratio           0
max.length_aspect_ratio        0
scatter_ratio                  0
elongatedness                  0
pr.axis_rectangularity         0
max.length_rectangularity      0
scaled_variance                0
scaled_variance.1              0
scaled_radius_of_gyration      0
scaled_radius_of_gyration.1    0
skewness_about                 0
skewness_about.1               0
skewness_about.2               0
hollows_ratio                  0
class                          0
dtype: int64
In [45]:
ds_vehicle[ds_vehicle.isnull().any(axis=1)][missing_values_cols].shape
#Confirming that no NaN values are there in dataset
Out[45]:
(0, 14)
In [46]:
ds_vehicle["class"].value_counts()
Out[46]:
1.0    427
0.0    213
2.0    199
Name: class, dtype: int64
In [47]:
# 5 M's of the data
ds_vehicle.describe()
Out[47]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
count 839.000000 839.000000 839.000000 839.000000 839.000000 839.000000 839.000000 839.000000 839.000000 839.000000 839.000000 839.000000 839.000000 839.000000 839.000000 839.000000 839.000000 839.000000 839.000000
mean 93.709178 44.839094 82.138856 169.117998 61.710369 8.578069 168.910608 40.905840 20.584029 148.013111 188.753874 440.063170 174.697259 72.398093 6.359952 12.617402 188.961859 195.692491 0.983313
std 8.218746 6.144567 15.744684 33.346151 7.900381 4.617162 33.255794 7.803796 2.591483 14.522752 31.419128 176.579093 32.601944 7.467754 4.916886 8.945485 6.133439 7.415286 0.700976
min 73.000000 33.000000 40.000000 104.000000 47.000000 2.000000 112.000000 26.000000 17.000000 118.000000 130.000000 184.000000 109.000000 59.000000 0.000000 0.000000 176.000000 181.000000 0.000000
25% 87.500000 40.000000 70.000000 141.000000 57.000000 7.000000 146.000000 33.000000 19.000000 137.000000 167.000000 318.000000 149.000000 67.000000 2.000000 5.000000 184.000000 191.000000 0.000000
50% 93.000000 44.000000 80.000000 168.000000 61.000000 8.000000 157.000000 43.000000 20.000000 146.000000 179.000000 365.000000 174.000000 71.000000 6.000000 11.000000 189.000000 197.000000 1.000000
75% 100.000000 49.000000 98.000000 195.000000 65.000000 10.000000 198.000000 46.000000 23.000000 159.500000 217.000000 587.000000 198.000000 75.000000 9.000000 19.000000 193.000000 201.000000 1.000000
max 119.000000 59.000000 112.000000 333.000000 138.000000 55.000000 265.000000 61.000000 29.000000 188.000000 320.000000 1018.000000 268.000000 135.000000 22.000000 41.000000 206.000000 211.000000 2.000000
In [48]:
#Visualizing the dataframe
sns.pairplot(ds_vehicle,diag_kind="kde",hue="class")
C:\ProgramData\Anaconda3\lib\site-packages\statsmodels\nonparametric\kde.py:487: RuntimeWarning: invalid value encountered in true_divide
  binned = fast_linbin(X, a, b, gridsize) / (delta * nobs)
C:\ProgramData\Anaconda3\lib\site-packages\statsmodels\nonparametric\kdetools.py:34: RuntimeWarning: invalid value encountered in double_scalars
  FAC1 = 2*(np.pi*bw/RANGE)**2
Out[48]:
<seaborn.axisgrid.PairGrid at 0xe9aa43f6c8>
In [49]:
ds_vehicle.corr()
Out[49]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
compactness 1.000000 0.686604 0.788796 0.686633 0.086843 0.146984 0.812285 -0.787297 0.812485 0.675609 0.762360 0.816428 0.584940 -0.247175 0.235229 0.160276 0.292137 0.359007 -0.040613
circularity 0.686604 1.000000 0.792604 0.622413 0.152627 0.251364 0.850361 -0.820740 0.845617 0.962623 0.793571 0.839828 0.926730 0.053306 0.144388 -0.008807 -0.110447 0.042572 -0.160483
distance_circularity 0.788796 0.792604 1.000000 0.767319 0.155430 0.263508 0.905459 -0.910705 0.892616 0.772796 0.860497 0.887093 0.704378 -0.225976 0.111903 0.269172 0.142119 0.329912 -0.068718
radius_ratio 0.686633 0.622413 0.767319 1.000000 0.662058 0.450006 0.735301 -0.789860 0.708114 0.568592 0.794322 0.720586 0.537304 -0.175956 0.048621 0.177042 0.375760 0.464799 -0.193553
pr.axis_aspect_ratio 0.086843 0.152627 0.155430 0.662058 1.000000 0.648635 0.101309 -0.180390 0.075591 0.125059 0.271018 0.086417 0.121146 0.158853 -0.059920 -0.033179 0.235596 0.262996 -0.104519
max.length_aspect_ratio 0.146984 0.251364 0.263508 0.450006 0.648635 1.000000 0.165492 -0.178654 0.160343 0.305558 0.317904 0.142727 0.189340 0.299332 0.014740 0.042487 -0.028085 0.142175 0.206140
scatter_ratio 0.812285 0.850361 0.905459 0.735301 0.101309 0.165492 1.000000 -0.973064 0.991601 0.809601 0.950319 0.995708 0.800356 -0.024795 0.072716 0.214385 0.000387 0.114037 -0.294253
elongatedness -0.787297 -0.820740 -0.910705 -0.789860 -0.180390 -0.178654 -0.973064 1.000000 -0.949765 -0.772895 -0.936632 -0.955574 -0.764538 0.101778 -0.049746 -0.190859 -0.108178 -0.210594 0.345691
pr.axis_rectangularity 0.812485 0.845617 0.892616 0.708114 0.075591 0.160343 0.991601 -0.949765 1.000000 0.811122 0.936274 0.991342 0.797119 -0.011843 0.081910 0.217510 -0.026600 0.091823 -0.264618
max.length_rectangularity 0.675609 0.962623 0.772796 0.568592 0.125059 0.305558 0.809601 -0.772895 0.811122 1.000000 0.743895 0.796129 0.865898 0.043194 0.134779 0.004044 -0.109621 0.072563 -0.033269
scaled_variance 0.762360 0.793571 0.860497 0.794322 0.271018 0.317904 0.950319 -0.936632 0.936274 0.743895 1.000000 0.948142 0.777878 0.116022 0.036273 0.199991 0.007859 0.080591 -0.319321
scaled_variance.1 0.816428 0.839828 0.887093 0.720586 0.086417 0.142727 0.995708 -0.955574 0.991342 0.796129 0.948142 1.000000 0.796536 -0.014269 0.075778 0.204644 0.000832 0.098031 -0.294238
scaled_radius_of_gyration 0.584940 0.926730 0.704378 0.537304 0.121146 0.189340 0.800356 -0.764538 0.797119 0.865898 0.777878 0.796536 1.000000 0.192406 0.165728 -0.053631 -0.229344 -0.122056 -0.251488
scaled_radius_of_gyration.1 -0.247175 0.053306 -0.225976 -0.175956 0.158853 0.299332 -0.024795 0.101778 -0.011843 0.043194 0.116022 -0.014269 0.192406 1.000000 -0.087463 -0.122582 -0.749611 -0.802296 -0.206859
skewness_about 0.235229 0.144388 0.111903 0.048621 -0.059920 0.014740 0.072716 -0.049746 0.081910 0.134779 0.036273 0.075778 0.165728 -0.087463 1.000000 -0.035825 0.114931 0.095696 0.119462
skewness_about.1 0.160276 -0.008807 0.269172 0.177042 -0.033179 0.042487 0.214385 -0.190859 0.217510 0.004044 0.199991 0.204644 -0.053631 -0.122582 -0.035825 1.000000 0.079010 0.206329 -0.014531
skewness_about.2 0.292137 -0.110447 0.142119 0.375760 0.235596 -0.028085 0.000387 -0.108178 -0.026600 -0.109621 0.007859 0.000832 -0.229344 -0.749611 0.114931 0.079010 1.000000 0.892187 0.059804
hollows_ratio 0.359007 0.042572 0.329912 0.464799 0.262996 0.142175 0.114037 -0.210594 0.091823 0.072563 0.080591 0.098031 -0.122056 -0.802296 0.095696 0.206329 0.892187 1.000000 0.228357
class -0.040613 -0.160483 -0.068718 -0.193553 -0.104519 0.206140 -0.294253 0.345691 -0.264618 -0.033269 -0.319321 -0.294238 -0.251488 -0.206859 0.119462 -0.014531 0.059804 0.228357 1.000000
In [50]:
#Correlation Matrix
corr = ds_vehicle.corr() # correlation matrix

#sns.set_style(style = 'white')  # Setting it to white so that we do not see the grid lines
#plt.figure(figsize=(15,8))
#sns.set_context(font_scale=0.8, rc={"lines.linewidth": 2.0})
#ax = sns.heatmap(
#    corr, 
#    annot=True,
#    fmt = '.1g'
#)
#ax.set_xticklabels(
#    ax.get_xticklabels(),
#    rotation=45,
#    horizontalalignment='right'
#);

lower_triangle = np.tril(corr, k = -1)  # select only the lower triangle of the correlation matrix
mask = lower_triangle == 0  # to mask the upper triangle in the following heatmap

plt.figure(figsize = (15,8))  # setting the figure size
sns.set_style(style = 'white')  # Setting it to white so that we do not see the grid lines
sns.heatmap(lower_triangle, center=0.5, cmap= 'Blues', annot= True, xticklabels = corr.index, yticklabels = corr.columns,
            cbar= False, linewidths= 1, mask = mask)   # Da Heatmap
plt.xticks(rotation = 50)   # Aesthetic purposes
plt.yticks(rotation = 20)   # Aesthetic purposes
plt.show()

Much of the relationships have been clearly explained in the pairplot.

Spread of compactness is least for van. mean compactness is highest for car. For Bus compactness is right skewed indicating that less number of buses have high compactness.

Mean circularity is higher for cars

Mean distance_circularity is also higher for cars

Mean radius_ratio is higher for cars, followed by Bus. It is least for vans pr.axis_aspect_ratio is has almost same distribution for car, van and buses max.length_aspect_ratio is almost same for cars and vans, lower for buses

Mean scatter ratio is highest for cars, followed by bus and van

Mean elomngatedness is highest for vans folowed by bus and car pr.axis_rectangularity is highest for cars, followed by bus and then vans distribution of max.length_rectangularity is almost same for cars, bus and vans

Mean scaled variance is highest for cars followed by bus then vans

Mean scaled variance1 is highest for cars followed by bus then vans 'scaled_radius_of_gyration', 'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1', 'skewness_about.2', have almost similar distribution for cars, buses and vans. 'hollows_ratio' is lower for buses as compared to cars and vans

Many columns have lonmg tails indicating outliers pr.axis_aspect ratio and radius ratio varies strongly +ve for van. for cars and buses it varies in small range- mostly cpuld like Scatter ratio & Scaled_variance1 has almost perfect positive linear relationship

Many features show high correlation indicating that we need to drop multiple features : USING PCA WILL HELP IN REDUCING THE VARIABLES

In [51]:
df_train, df_test= train_test_split(ds_vehicle, test_size = 0.3, random_state = 1)
df_train.shape, df_test.shape
Out[51]:
((587, 19), (252, 19))
In [52]:
#We will handle outliers in training data and leave in testing data.

# 1. Outliers if close to max value will be replaced with max value of the corresponding class
# 2. If outliers are much above 75% quantile range: mean+2SD, we drop them
# 3. Outlier if close to min value will be replaced by min value of the corresponding class
# 4. If outliers are much lower than 25% quantile range :mean-2SD, we drop them

def outlierCheck(inputSeries):
    
    q1 = inputSeries.quantile(0.25)
    q3 = inputSeries.quantile(0.75)
   
    iqr = q3-q1 #Interquartile range
    low_range  = q1-1.5*iqr
    high_range = q3+1.5*iqr
    outliers_low = inputSeries[(inputSeries < low_range)]
    outliers_high= inputSeries[(inputSeries > high_range)]

    print ("25th Quantile value: ", q1)
    print('Outlier low Count =', outliers_low.count())
    print('List of Low outliers: \n')
    print(outliers_low)

    print ("75th Quantile value: ", q3)
    print('Outlier High Count = ', outliers_high.count())
    print('List of High outliers: \n')
    print(outliers_high)
In [53]:
#Checking outliers in Compactness attribute
sns.boxplot(df_train['compactness'])
#We can see 1 value that is approx to be 120. Since this seems to be very close to the whisker edge, we are keeping it as is.
Out[53]:
<matplotlib.axes._subplots.AxesSubplot at 0xe9c09bec48>
In [54]:
#Cheking outliers in circularity
sns.boxplot(df_train["circularity"])
#No outliers in this attribute
Out[54]:
<matplotlib.axes._subplots.AxesSubplot at 0xe9c09e0ec8>
In [55]:
sns.boxplot(df_train["distance_circularity"])
#As observed, the plot is skewed towards the right, so we can safely say data is more distributed towards the right of mean
Out[55]:
<matplotlib.axes._subplots.AxesSubplot at 0xe9c0a07fc8>
In [56]:
sns.boxplot(df_train["radius_ratio"])
#There are outliers. So we analyze the data based on our logic defined above and take appropriate decison
Out[56]:
<matplotlib.axes._subplots.AxesSubplot at 0xe9c0a3c188>
In [57]:
outlierCheck(df_train["radius_ratio"])
25th Quantile value:  141.0
Outlier low Count = 0
List of Low outliers: 

Series([], Name: radius_ratio, dtype: float64)
75th Quantile value:  194.0
Outlier High Count =  2
List of High outliers: 

388    333.0
37     306.0
Name: radius_ratio, dtype: float64
In [58]:
#Checking the full rows 
df_train.loc[[37,388]]
Out[58]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
37 90.0 48.0 86.0 306.0 126.0 49.0 153.0 44.0 19.0 156.0 272.0 346.0 200.0 118.0 0.0 15.0 185.0 194.0 2.0
388 94.0 47.0 85.0 333.0 138.0 49.0 155.0 43.0 19.0 155.0 320.0 354.0 187.0 135.0 12.0 9.0 188.0 196.0 2.0
In [59]:
#Since both of these are from the same category, we will replace both these values with max'm value acceptable for VAN class
#The maximum value accepted without making as an outlier is 250 (as seen from the plot) ; therefore
df_train.loc[[37,388],"radius_ratio"]=250.0
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py:205: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
In [60]:
sns.boxplot(df_train["pr.axis_aspect_ratio"])
Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0xe9c12ae908>
In [61]:
#Let us check the outlier in depth
outlierCheck(df_train["pr.axis_aspect_ratio"])
25th Quantile value:  57.0
Outlier low Count = 0
List of Low outliers: 

Series([], Name: pr.axis_aspect_ratio, dtype: float64)
75th Quantile value:  65.0
Outlier High Count =  5
List of High outliers: 

4      103.0
100    126.0
291    102.0
388    138.0
37     126.0
Name: pr.axis_aspect_ratio, dtype: float64
In [62]:
df_train.loc[[4,37,100,291,388]]
Out[62]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
4 85.0 44.0 70.0 205.0 103.0 52.0 149.0 45.0 19.0 144.0 241.0 325.0 188.0 127.0 9.0 11.0 180.0 183.0 0.0
37 90.0 48.0 86.0 250.0 126.0 49.0 153.0 44.0 19.0 156.0 272.0 346.0 200.0 118.0 0.0 15.0 185.0 194.0 2.0
100 82.0 45.0 66.0 252.0 126.0 52.0 148.0 45.0 19.0 144.0 237.0 326.0 185.0 119.0 1.0 1.0 181.0 185.0 0.0
291 89.0 45.0 81.0 246.0 102.0 43.0 155.0 44.0 20.0 160.0 200.0 347.0 177.0 90.0 9.0 17.0 183.0 192.0 2.0
388 94.0 47.0 85.0 250.0 138.0 49.0 155.0 43.0 19.0 155.0 320.0 354.0 187.0 135.0 12.0 9.0 188.0 196.0 2.0
In [63]:
#Since the values are distributed amongst 2 classes, we will check their values
print("Values for class 0 \n")
df_train[df_train["class"]==0]["pr.axis_aspect_ratio"].sort_values( ascending=False).head(5)
#Since the maxm values are within the whisker 76, we are dropping 100 and 4
df_train.drop([4,100], inplace=True)
Values for class 0 

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py:4102: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
In [64]:
print("Values for class 2 \n")
df_train[df_train["class"]==2]["pr.axis_aspect_ratio"].sort_values( ascending=False).head(10)
#Again since there is a big drop from 70 to 102, so we are dropping these values
df_train.drop([388,37,291], inplace=True)
Values for class 2 

In [65]:
sns.boxplot(df_train["max.length_aspect_ratio"])
Out[65]:
<matplotlib.axes._subplots.AxesSubplot at 0xe9c13c2608>
In [66]:
outlierCheck(df_train["max.length_aspect_ratio"])
25th Quantile value:  7.0
Outlier low Count = 1
List of Low outliers: 

655    2.0
Name: max.length_aspect_ratio, dtype: float64
75th Quantile value:  10.0
Outlier High Count =  2
List of High outliers: 

127    22.0
391    25.0
Name: max.length_aspect_ratio, dtype: float64
In [67]:
#Checking the rows for these outliers :
df_train.loc[[391,127]]
Out[67]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
391 91.0 38.0 70.0 160.0 66.0 25.0 140.0 47.0 18.0 139.0 162.0 296.0 130.0 67.0 4.0 11.0 192.0 202.0 2.0
127 85.0 41.0 66.0 155.0 65.0 22.0 149.0 45.0 19.0 139.0 173.0 330.0 155.0 75.0 6.0 16.0 184.0 191.0 0.0
In [68]:
#Checking values for class 2 for max.length_aspect_ratio attribute
df_train[df_train['class']==2]['max.length_aspect_ratio'].sort_values( ascending=False).head(10)
#There is a major up from 12-25 so dropping these values
df_train.drop(391,inplace=True)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py:4102: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
In [69]:
print("Now values for class 0")
df_train[df_train['class']==0]['max.length_aspect_ratio'].sort_values( ascending=False).head(10)
#There is another drop from 8-22 so dropping that value
df_train.drop(127,inplace=True)
Now values for class 0
In [70]:
sns.boxplot(df_train["scatter_ratio"])
#No visible outlier
Out[70]:
<matplotlib.axes._subplots.AxesSubplot at 0xe9c1411908>
In [71]:
sns.boxplot(df_train["elongatedness"])
#No outlier present
Out[71]:
<matplotlib.axes._subplots.AxesSubplot at 0xe9c1464f48>
In [72]:
sns.boxplot(df_train["pr.axis_rectangularity"])
#No outlier present
Out[72]:
<matplotlib.axes._subplots.AxesSubplot at 0xe9c173f208>
In [73]:
sns.boxplot(df_train["max.length_rectangularity"])
#No outlier present
Out[73]:
<matplotlib.axes._subplots.AxesSubplot at 0xe9c1781b48>
In [74]:
sns.boxplot(df_train["scaled_variance"])
#No outlier present
Out[74]:
<matplotlib.axes._subplots.AxesSubplot at 0xe9c17ff1c8>
In [75]:
sns.boxplot(df_train["scaled_variance.1"])
#As we can see there is 1 outlier
Out[75]:
<matplotlib.axes._subplots.AxesSubplot at 0xe9c1713f48>
In [76]:
#Checking outlier in scaled.variance.1
outlierCheck(df_train["scaled_variance.1"])
25th Quantile value:  320.0
Outlier low Count = 0
List of Low outliers: 

Series([], Name: scaled_variance.1, dtype: float64)
75th Quantile value:  586.25
Outlier High Count =  1
List of High outliers: 

835    1018.0
Name: scaled_variance.1, dtype: float64
In [77]:
df_train.loc[[835]]
Out[77]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
835 111.0 58.0 105.0 183.0 51.0 6.0 265.0 26.0 29.0 174.0 285.0 1018.0 255.0 85.0 4.0 8.0 181.0 183.0 0.0
In [78]:
df_train[df_train['class']==0]['scaled_variance.1'].sort_values( ascending=False).head(8)
df_train.drop(835,inplace=True)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py:4102: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
In [79]:
sns.boxplot(df_train["scaled_radius_of_gyration.1"])
Out[79]:
<matplotlib.axes._subplots.AxesSubplot at 0xe9c18b2688>
In [80]:
outlierCheck(df_train["scaled_radius_of_gyration.1"])
25th Quantile value:  67.0
Outlier low Count = 0
List of Low outliers: 

Series([], Name: scaled_radius_of_gyration.1, dtype: float64)
75th Quantile value:  75.0
Outlier High Count =  5
List of High outliers: 

381    88.0
79     88.0
498    88.0
230    89.0
655    90.0
Name: scaled_radius_of_gyration.1, dtype: float64
In [81]:
#As we observe, most of the outliers are present under value 90 and quite near the whisker of 87. 
#Hence we are leaving them as is.
In [82]:
sns.boxplot(df_train["skewness_about"])
Out[82]:
<matplotlib.axes._subplots.AxesSubplot at 0xe9c190e5c8>
In [83]:
outlierCheck(df_train['skewness_about'])
25th Quantile value:  2.0
Outlier low Count = 0
List of Low outliers: 

Series([], Name: skewness_about, dtype: float64)
75th Quantile value:  9.0
Outlier High Count =  6
List of High outliers: 

516    22.0
505    21.0
44     20.0
797    21.0
623    22.0
400    21.0
Name: skewness_about, dtype: float64
In [84]:
df_train.loc[[516,505,44,797,623,400]]
Out[84]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
516 92.0 42.0 75.0 172.0 60.0 8.0 147.0 45.0 19.0 142.0 174.0 322.0 160.0 62.0 22.0 10.0 206.0 211.0 1.0
505 115.0 53.0 100.0 205.0 64.0 11.0 220.0 30.0 25.0 166.0 229.0 710.0 214.0 71.0 21.0 11.0 189.0 199.0 1.0
44 119.0 54.0 106.0 220.0 65.0 12.0 213.0 31.0 24.0 167.0 223.0 675.0 232.0 66.0 20.0 1.0 192.0 202.0 1.0
797 111.0 53.0 108.0 211.0 61.0 11.0 207.0 32.0 23.0 167.0 217.0 636.0 216.0 64.0 21.0 2.0 196.0 205.0 1.0
623 106.0 52.0 108.0 207.0 64.0 12.0 221.0 31.0 25.0 168.0 229.0 709.0 200.0 73.0 22.0 38.0 190.0 205.0 1.0
400 107.0 53.0 108.0 213.0 64.0 12.0 206.0 32.0 23.0 163.0 216.0 627.0 202.0 65.0 21.0 22.0 194.0 205.0 1.0
In [85]:
#Since all of these belong to class 1, we will find out the whisker of these values by
df_train[df_train['class']==1]['skewness_about'].sort_values( ascending=False).head(20)
Out[85]:
516    22.0
623    22.0
505    21.0
797    21.0
400    21.0
44     20.0
662    19.0
326    19.0
196    19.0
689    18.0
167    18.0
255    18.0
96     17.0
297    17.0
161    17.0
554    17.0
811    16.0
112    16.0
289    16.0
764    16.0
Name: skewness_about, dtype: float64
In [86]:
#We will replace these values by the nearest whisker value : 18
arr =[516,505,44,797,623,400]
#df_train["skewness_about"].replace(18)
df_train.loc[516]["skewness_about"]=18
df_train.loc[505]["skewness_about"]=18
df_train.loc[44]["skewness_about"]=18
df_train.loc[797]["skewness_about"]=18
df_train.loc[623]["skewness_about"]=18
df_train.loc[400]["skewness_about"]=18
In [87]:
sns.boxplot(df_train["skewness_about.1"])
#Since the outlier is neaer to the whisker, we will let it be.
Out[87]:
<matplotlib.axes._subplots.AxesSubplot at 0xe9c198a648>
In [88]:
sns.boxplot(df_train["skewness_about.2"])
Out[88]:
<matplotlib.axes._subplots.AxesSubplot at 0xe9c19e2848>
In [89]:
outlierCheck(df_train["skewness_about.2"])
25th Quantile value:  185.0
Outlier low Count = 0
List of Low outliers: 

Series([], Name: skewness_about.2, dtype: float64)
75th Quantile value:  193.0
Outlier High Count =  1
List of High outliers: 

516    206.0
Name: skewness_about.2, dtype: float64
In [90]:
df_train.drop(516,inplace=True)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py:4102: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
In [91]:
sns.boxplot(df_train['hollows_ratio'])
#No outlier
Out[91]:
<matplotlib.axes._subplots.AxesSubplot at 0xe9c1a2f6c8>
In [92]:
#Final shape of training data
df_train.shape
Out[92]:
(578, 19)
In [93]:
dropping_labels=["class","elongatedness","scaled_radius_of_gyration.1","hollows_ratio","skewness_about.1","pr.axis_aspect_ratio"]
X_train=df_train.drop(dropping_labels, axis=1)
y_train=df_train['class']
X_test=df_test.drop(dropping_labels, axis=1)
y_test=df_test['class']
X_train.shape,y_train.shape, X_test.shape, y_test.shape
Out[93]:
((578, 13), (578,), (252, 13), (252,))
In [94]:
from sklearn.svm import SVC

# Building a Support Vector Machine on train data
svc_model = SVC(C= .1, kernel='linear', gamma= 1)
svc_model.fit(X_train, y_train)

prediction = svc_model .predict(X_test)
In [95]:
# check the accuracy on the training set
print(svc_model.score(X_train, y_train))
print(svc_model.score(X_test, y_test))
0.8875432525951558
0.8531746031746031
In [96]:
print("Confusion Matrix:\n",confusion_matrix(prediction,y_test))
Confusion Matrix:
 [[ 44  18   3]
 [  6 104   1]
 [  2   7  67]]

Working towards PCA

In [97]:
dropping_labels=["class","elongatedness","scaled_radius_of_gyration.1","hollows_ratio","skewness_about.1","pr.axis_aspect_ratio"]
X_train_PCA=df_train.drop(dropping_labels, axis=1)
y_train_PCA=df_train['class']
X_test_PCA=df_test.drop(dropping_labels, axis=1)
y_test_PCA=df_test['class']
X_train_PCA.shape,y_train_PCA.shape, X_test_PCA.shape, y_test_PCA.shape
Out[97]:
((578, 13), (578,), (252, 13), (252,))
In [98]:
sc = StandardScaler()
In [99]:
sc.fit(X_train_PCA) # Fit scaler in train set
Out[99]:
StandardScaler(copy=True, with_mean=True, with_std=True)
In [100]:
# transform train set
#Transform X_train
X_train_std=sc.transform(X_train_PCA)
#Transform X_test ( with same fit as train) to prevent data leak
X_test_std=sc.transform(X_test_PCA)
In [101]:
# Covariance Matrix
cov_matrix = np.cov(X_train_std.T)

print('Covariance Matrix \n%s', cov_matrix)
Covariance Matrix 
%s [[ 1.0017331   0.67948799  0.78642956  0.73573259  0.50393448  0.81454299
   0.81317683  0.67744076  0.79039161  0.81945387  0.57450366  0.22950946
   0.30578979]
 [ 0.67948799  1.0017331   0.77849182  0.63572264  0.5610998   0.8413649
   0.836861    0.96403517  0.79279758  0.83000102  0.92621663  0.15567302
  -0.09812221]
 [ 0.78642956  0.77849182  1.0017331   0.81204688  0.6668931   0.90561627
   0.89427155  0.75991317  0.88213638  0.88845363  0.69838314  0.13302493
   0.16281709]
 [ 0.73573259  0.63572264  0.81204688  1.0017331   0.47493742  0.78434207
   0.75569174  0.57260182  0.78677447  0.7705363   0.54924869  0.05982616
   0.44801852]
 [ 0.50393448  0.5610998   0.6668931   0.47493742  1.0017331   0.48654835
   0.48389033  0.6353582   0.39933179  0.44688851  0.4026556   0.14046405
   0.10275195]
 [ 0.81454299  0.8413649   0.90561627  0.78434207  0.48654835  1.0017331
   0.99357665  0.80864188  0.98067694  0.9973901   0.79632212  0.0973812
   0.0269373 ]
 [ 0.81317683  0.836861    0.89427155  0.75569174  0.48389033  0.99357665
   1.0017331   0.81115278  0.96810911  0.9930931   0.7937835   0.10171839
  -0.00144679]
 [ 0.67744076  0.96403517  0.75991317  0.57260182  0.6353582   0.80864188
   0.81115278  1.0017331   0.74908676  0.79561076  0.87198547  0.15587789
  -0.1028799 ]
 [ 0.79039161  0.79279758  0.88213638  0.78677447  0.39933179  0.98067694
   0.96810911  0.74908676  1.0017331   0.97923165  0.77923725  0.06054329
   0.05073429]
 [ 0.81945387  0.83000102  0.88845363  0.7705363   0.44688851  0.9973901
   0.9930931   0.79561076  0.97923165  1.0017331   0.79151729  0.10064689
   0.02915203]
 [ 0.57450366  0.92621663  0.69838314  0.54924869  0.4026556   0.79632212
   0.7937835   0.87198547  0.77923725  0.79151729  1.0017331   0.16948678
  -0.21544495]
 [ 0.22950946  0.15567302  0.13302493  0.05982616  0.14046405  0.0973812
   0.10171839  0.15587789  0.06054329  0.10064689  0.16948678  1.0017331
   0.08483985]
 [ 0.30578979 -0.09812221  0.16281709  0.44801852  0.10275195  0.0269373
  -0.00144679 -0.1028799   0.05073429  0.02915203 -0.21544495  0.08483985
   1.0017331 ]]
In [102]:
eig_vals, eig_vecs = np.linalg.eig(cov_matrix)
print('Eigen Vectors \n%s', eig_vecs)
print('\n Eigen Values \n%s', eig_vals)
Eigen Vectors 
%s [[-2.87744272e-01 -2.41032832e-01  6.27649237e-02  9.77860218e-02
   1.56509861e-01  7.43358742e-01  4.44048847e-01  2.61509536e-01
   4.66329308e-02 -2.31273307e-02 -2.47268539e-03  1.52839080e-02
  -3.38802245e-03]
 [-3.08664273e-01  2.08220597e-01  9.71793852e-02 -1.04673990e-01
  -4.17982161e-01  5.65950214e-02  4.26787107e-02 -1.41652245e-01
   2.49847491e-01  3.68887174e-02  1.14505559e-01 -3.77369803e-01
  -6.49512987e-01]
 [-3.16666413e-01 -1.13505380e-01 -1.42546384e-02 -1.21424273e-01
   2.49408923e-01 -2.66242051e-01 -2.35724189e-01  4.90809282e-01
   6.56473035e-01  1.22409372e-02 -2.44905510e-02 -3.75796601e-02
   9.60410025e-02]
 [-2.75675476e-01 -3.54955445e-01 -1.44529404e-01  1.07037973e-02
  -1.39682813e-01 -4.94240497e-01  6.58435168e-01 -2.38423244e-01
   5.54122052e-02  2.40265687e-02 -9.56448388e-04  5.11932120e-02
   1.28722278e-01]
 [-2.04424348e-01 -8.43304803e-02  2.81616497e-01 -7.94946844e-01
   2.92147144e-01 -6.48514607e-02  4.66895065e-03 -2.97156924e-03
  -3.74202432e-01  3.43027463e-02 -1.09687729e-03  1.10390284e-03
  -9.51728107e-02]
 [-3.30654571e-01  2.12631710e-02 -1.16888365e-01  1.38823374e-01
   1.68532854e-01 -2.94272967e-02 -1.47899564e-01 -1.76504754e-01
  -1.29678486e-01 -7.76071787e-01 -3.71936952e-01 -1.24424859e-01
  -4.53932222e-02]
 [-3.28381379e-01  4.31411016e-02 -1.07437329e-01  1.40499531e-01
   1.99903588e-01  3.06539960e-02 -1.71338483e-01 -2.04558931e-01
  -1.41030645e-01  2.94649794e-02  7.03202101e-01 -3.60220919e-01
   3.19232110e-01]
 [-3.00698070e-01  2.10258783e-01  1.41541268e-01 -2.19027526e-01
  -3.38195453e-01  2.40748711e-01 -1.49354156e-01 -3.88392228e-01
   2.78261164e-01 -4.89088602e-03 -6.09255188e-02  4.40382928e-01
   4.20441905e-01]
 [-3.20844301e-01  1.01097359e-03 -1.82843515e-01  2.23155997e-01
   1.43383214e-01 -9.27385318e-02 -1.52935090e-01  4.42765532e-02
  -1.85950538e-01  9.76296288e-02  2.27296349e-01  6.78525586e-01
  -4.50720620e-01]
 [-3.27761399e-01  2.15204353e-02 -1.25189153e-01  1.87812043e-01
   1.61821318e-01  2.68064748e-02 -1.68654387e-01 -1.77140235e-01
  -1.52945699e-01  6.18522557e-01 -5.44230866e-01 -2.23246768e-01
   4.94677928e-02]
 [-2.86863390e-01  3.11161706e-01  7.56649662e-02  7.63932046e-02
  -4.52894063e-01 -1.48283294e-01  6.06085612e-02  5.89813327e-01
  -4.13496583e-01 -2.67093337e-02 -3.95713843e-02 -2.84107260e-02
   2.37921579e-01]
 [-5.42606921e-02 -8.81220994e-02  8.86649631e-01  3.93128742e-01
   1.14330396e-01 -1.64287504e-01 -1.13381549e-02 -8.52539511e-02
   4.38724406e-03  2.58489470e-03  6.74624267e-03  2.81054610e-02
  -1.65083181e-02]
 [-2.60032461e-02 -7.75232605e-01 -7.11379393e-04 -1.54642052e-02
  -4.31345842e-01  8.23872337e-02 -4.27435808e-01  3.82792331e-02
  -1.35065516e-01 -1.41160001e-02  1.66125135e-02 -4.80965995e-02
  -7.33595876e-03]]

 Eigen Values 
%s [8.68934382e+00 1.48580159e+00 1.05518552e+00 7.89822717e-01
 4.10031137e-01 2.58862786e-01 1.27630266e-01 8.56019629e-02
 6.40135731e-02 2.60692995e-03 7.02446326e-03 2.62563803e-02
 2.03491851e-02]
In [103]:
print("Eigen Values:")
pd.DataFrame(eig_vals).transpose()
Eigen Values:
Out[103]:
0 1 2 3 4 5 6 7 8 9 10 11 12
0 8.689344 1.485802 1.055186 0.789823 0.410031 0.258863 0.12763 0.085602 0.064014 0.002607 0.007024 0.026256 0.020349
In [104]:
tot = sum(eig_vals)
var_exp = [( i /tot ) * 100 for i in sorted(eig_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)   # array of size =  as many PC dimensions
print("Cumulative Variance Explained", cum_var_exp)
Cumulative Variance Explained [ 66.72546425  78.13493344  86.23770223  92.30275024  95.45137902
  97.43918615  98.41925886  99.07659626  99.56815644  99.76977916
  99.92604054  99.97998139 100.        ]
In [105]:
# Ploting 
plt.figure(figsize=(15 , 6))
plt.bar(range(1, eig_vals.size + 1), var_exp, alpha = 0.5, align = 'center', label = 'Individual explained variance')
plt.step(range(1, eig_vals.size + 1), cum_var_exp, where='mid', label = 'Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc = 'best')
plt.tight_layout()
plt.show()
In [106]:
# Make a set of (eigenvalue, eigenvector) pairs
eig_pairs = [(eig_vals[index], eig_vecs[index]) for index in range(len(eig_vals))]

# Sort the (eigenvalue, eigenvector) pairs from highest to lowest with respect to eigenvalue by default take first field for sorting
eig_pairs.sort(reverse=True)


# Note: always form pair of eigen vector and values  first before sorting...

# Extract the descending ordered eigenvalues and eigenvectors
eigvalues_sorted = [eig_pairs[index][0] for index in range(len(eig_vals))]
eigvectors_sorted = [eig_pairs[index][1] for index in range(len(eig_vals))]
#Dimesionality reduction 

P_reduce = np.array(eigvectors_sorted[0:8]).transpose()   # Selecting first 8 eigen vector out if 18

Proj_train_data = np.dot(X_train_std,P_reduce)   # projecting training data onto the eight eigen vectors

Proj_test_data = np.dot(X_test_std,P_reduce)    # projecting test data onto the eight eigen vectors
#Check shapes of train and test new feature and target set after PCA
Proj_train_data.shape,y_train.shape,Proj_test_data.shape,y_test.shape
Out[106]:
((578, 8), (578,), (252, 8), (252,))
In [107]:
# Use SVM

from sklearn.svm import SVC

# Building a Support Vector Machine on train data
svc_model = SVC(C= .1, kernel='linear', gamma= 1)
svc_model.fit(Proj_train_data, y_train)

prediction = svc_model.predict(Proj_test_data)
In [108]:
# check the accuracy on the training set
print(svc_model.score(Proj_train_data, y_train))
print(svc_model.score(Proj_test_data, y_test))
0.8166089965397924
0.8293650793650794
In [109]:
print("Confusion Matrix:\n",confusion_matrix(prediction,y_test))
Confusion Matrix:
 [[ 34  13   4]
 [ 16 109   1]
 [  2   7  66]]
In [110]:
# Building a Support Vector Machine on train data
svc_model = SVC(kernel='rbf')
svc_model.fit(Proj_train_data, y_train_PCA)

prediction = svc_model.predict(Proj_test_data)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
In [111]:
print(svc_model.score(Proj_train_data, y_train_PCA))
print(svc_model.score(Proj_test_data, y_test_PCA))
0.9238754325259516
0.9047619047619048
In [112]:
print("Confusion Matrix:\n",confusion_matrix(prediction,y_test_PCA))
Confusion Matrix:
 [[ 47   3   1]
 [  4 120   9]
 [  1   6  61]]

As we can see with the RBF kernel(radius bias function), we get a much accurate range of result on test data.

In [ ]: